Text


In [ ]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer

# Seeded so the SGD shuffling (and therefore the reported score) is reproducible.
sgd = SGDClassifier(random_state=0)
# HashingVectorizer is used without a fit step, so each batch can be
# transformed independently of the others.
hashing_vectorizer = HashingVectorizer()

# partial_fit must be told the complete label set on its first call, since a
# single batch is not guaranteed to contain every class.
# NOTE(review): assumes Insult is a 0/1 indicator column -- confirm against the
# CSVs. The previous classes=range(10) declared ten labels for this target.
insult_classes = [0, 1]

# Out-of-core training: stream the ten training shards one at a time.
for i in range(10):
    data_batch = pd.read_csv("data/train_%d.csv" % i)
    text_batch = data_batch.Comment.tolist()
    y_batch = data_batch.Insult.values
    X_batch = hashing_vectorizer.transform(text_batch)
    sgd.partial_fit(X_batch, y_batch, classes=insult_classes)

In [ ]:
# Score the incrementally trained classifier on the labelled test file,
# using the same hashing vectorizer that produced the training features.
data_test = pd.read_csv("data/test_with_solutions.csv")
y_test = data_test["Insult"].values
test_comments = data_test["Comment"].tolist()
X_test = hashing_vectorizer.transform(test_comments)
# Last expression of the cell: displayed as the cell output.
sgd.score(X_test, y_test)

Kernel Approximations


In [ ]:
from sklearn.kernel_approximation import RBFSampler
import cPickle

# Both estimators are stochastic; seed them so a Restart & Run All reproduces
# the same feature map and the same score.
sgd = SGDClassifier(random_state=0)
kernel_approximation = RBFSampler(gamma=.001, n_components=400, random_state=0)

# Batches 0-8 are used for training; each is loaded, mapped and learned in turn.
for i in range(9):
    # Pickles are binary data: open with "rb" and let the context manager close
    # the handle instead of leaking it.
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open("data/batch_%02d.pickle" % i, "rb") as batch_file:
        X_batch, y_batch = cPickle.load(batch_file)
    if i == 0:
        # Fit the sampler once, on the first batch only, so that every batch
        # (and the test data) is transformed by the same fitted feature map.
        kernel_approximation.fit(X_batch)
    X_transformed = kernel_approximation.transform(X_batch)
    # The full label set is declared up front because a single batch is not
    # guaranteed to contain every class.
    # NOTE(review): assumes the targets are the ten labels 0-9 -- confirm.
    sgd.partial_fit(X_transformed, y_batch, classes=range(10))

In [ ]:
# Batch 9 is not read by the training loop (which covers batches 0-8), so it
# serves as the evaluation set here.
# Open the pickle in binary mode and close the handle deterministically.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open("data/batch_09.pickle", "rb") as test_file:
    X_test, y_test = cPickle.load(test_file)

# Apply the already-fitted feature map before scoring; the last expression is
# displayed as the cell output.
sgd.score(kernel_approximation.transform(X_test), y_test)

In [ ]: